## prepare arguments based on the config file
REF = config['ref_fa']
## if file ends with .gz, remove the suffix (which will trigger unzipping the fasta)
if REF.endswith('.gz'):
    REF = REF[:-3]

#### Enumerate all the output TSV files
EVALOUT = ['prcurve', 'persize']
REGS = config['regions'].split()
EVAL = config['eval'].split()
EXPS  = config['exp'].split()
METHODS  = config['methods'].split()
SAMPLES = config['samples'].split()
if 'check_inv' not in config:
    config['check_inv'] = False
INV = config['check_inv']
OUT_PREFIX =  config['out_prefix']
if 'min_cov' not in config:
    config['min_cov'] = 0.5
MIN_COV=config['min_cov']
if 'envm' not in config:
    config['envm'] = {'bgzip': 'bgzip',
                      'bcftools': 'bcftools',
                      'sveval': 'sveval'}
                      

## Simple repeat track?
if 'simprep_bed' in config:
    SR_BED = config['simprep_bed']
else:
    SR_BED = 'NA'

## Rules
rule all:
    input:
        expand('out/{exp}-{method}-{sample}-{region}-{eval}-{evalout}.tsv', exp=EXPS,
               method=METHODS, sample=SAMPLES, region=REGS, eval=EVAL, evalout=EVALOUT)
    output:
        pdf=OUT_PREFIX + ".pdf",
        pr=OUT_PREFIX + "-persize.tsv",
        size=OUT_PREFIX + "-prcurve.tsv"
    params:
        filelist=OUT_PREFIX + ".filelist.txt"
    shell:
        """
        echo {input} | sed 's/ /\\n/g' > {params.filelist}
        Rscript -e "sveval::wrapper('mergetsvs')" -output {OUT_PREFIX} -tsvs {params.filelist}
        rm {params.filelist}
        """

# Clean all the TSV and RData for the specified configuration. E.g. to rerun everything with a new sveval version.
rule clean:
    params:
        tsv=expand('out/{exp}-{method}-{sample}-{region}-{eval}-{evalout}.tsv',
                   exp=EXPS, method=METHODS,
                   sample=SAMPLES, region=REGS, eval=EVAL, evalout=EVALOUT),
        rdata=expand('out/sveval-{exp}-{method}-{sample}-{region}-{eval}.RData',
                     exp=EXPS, method=METHODS,
                     sample=SAMPLES, region=REGS, eval=EVAL)
    shell:
        'rm -rf {params.tsv} {params.rdata} {OUT_PREFIX}.pdf {OUT_PREFIX}-prcurve.tsv {OUT_PREFIX}-persize.tsv'

# unzip reference fasta
rule unzip_reference:
    input: '{ref}.fa{sta}.gz'
    output: '{ref}.fa{sta, (sta)?}'
    shell: "gunzip -c {input} > {output}"

# bgzip a VCF file
rule bgzip:
    input: '{vcf}.vcf'
    output: '{vcf}.vcf.gz'
    envmodules:
        config['envm']['bgzip']
    shell:
        'bgzip -c {input} > {output}'

# normalize a VCF using bcftools
rule vcfnorm:
    input:
        vcf="vcf/{exp}-{method}-{sample}.vcf.gz",
        ref=REF
    output:
        "vcf/{exp}-{method}-{sample}.norm.vcf.gz"
    envmodules:
        config['envm']['bgzip'], config['envm']['bcftools']
    shell:
        """
        bcftools view {input.vcf} --exclude \'GT="0" || GT="." || GT="1"\' | bcftools norm - --fasta-ref {input.ref} --multiallelic -both | bcftools norm - --fasta-ref {input.ref} --multiallelic +both | bgzip > {output}
        """

# function to eventually define more complex file inputs (e.g. to analyze different datasets with different reference versions)
def evalinputs(wildcards):
    ins = {}
    ins['vcf'] = 'vcf/{}-{}-{}.norm.vcf.gz'.format(wildcards.exp, wildcards.method,
                                                   wildcards.sample)
    ins['truth'] = 'vcf/{}-truth-baseline.norm.vcf.gz'.format(wildcards.exp)
    if wildcards.region in config:
        ins['bed'] = config[wildcards.region]        
    return ins

# run sveval on inputs. Creates a R object in 'rdata' folder
rule sveval:
    input:
        unpack(evalinputs)
    output:
        rdata= "out/{exp}-{method}-{sample}-{region}-{eval}-sveval.RData",
        pr="out/{exp}-{method}-{sample}-{region}-{eval}-prcurve.tsv",
        ps="out/{exp}-{method}-{sample}-{region}-{eval}-persize.tsv"
    params:
        outprefix="out/{exp}-{method}-{sample}-{region}-{eval}",
        bed=lambda wildcards, input: 'NA' if wildcards.region not in config else input['bed']
    resources:
        mem_mb=8000,
        runtime=30
    envmodules:
        config['envm']['sveval']
    shell:
        """
        Rscript -e  "sveval::wrapper('sveval')" -calls {input.vcf} -truth {input.truth} -sample {wildcards.sample} -inversion {INV} -region {params.bed} -simprep {SR_BED} -eval {wildcards.eval} -output {params.outprefix} -minol {MIN_COV}
        """
